{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### GBDT"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "from cart import TreeNode, BinaryDecisionTree, ClassificationTree, RegressionTree\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.metrics import mean_squared_error\n",
    "from utils import feature_split, calculate_gini, data_shuffle"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "### GBDT定义\n",
    "class GBDT(object):\n",
    "    def __init__(self, n_estimators, learning_rate, min_samples_split,\n",
    "                 min_gini_impurity, max_depth, regression):\n",
    "        ### 常用超参数\n",
    "        # 树的棵树\n",
    "        self.n_estimators = n_estimators\n",
    "        # 学习率\n",
    "        self.learning_rate = learning_rate\n",
    "        # 结点最小分裂样本数\n",
    "        self.min_samples_split = min_samples_split\n",
    "        # 结点最小基尼不纯度\n",
    "        self.min_gini_impurity = min_gini_impurity\n",
    "        # 最大深度\n",
    "        self.max_depth = max_depth\n",
    "        # 默认为回归树\n",
    "        self.regression = regression\n",
    "        # 损失为平方损失\n",
    "        self.loss = SquareLoss()\n",
    "        # 如果是分类树，需要定义分类树损失函数\n",
    "        # 这里省略，如需使用，需自定义分类损失函数\n",
    "        if not self.regression:\n",
    "            self.loss = None\n",
    "        # 多棵树叠加\n",
    "        self.estimators = []\n",
    "        for i in range(self.n_estimators):\n",
    "            self.estimators.append(RegressionTree(min_samples_split=self.min_samples_split,\n",
    "                                             min_gini_impurity=self.min_gini_impurity,\n",
    "                                             max_depth=self.max_depth))\n",
    "    # 拟合方法\n",
    "    def fit(self, X, y):\n",
    "        # 前向分步模型初始化，第一棵树\n",
    "        self.estimators[0].fit(X, y)\n",
    "        # 第一棵树的预测结果\n",
    "        y_pred = self.estimators[0].predict(X)\n",
    "        # 前向分步迭代训练\n",
    "        for i in range(1, self.n_estimators):\n",
    "            gradient = self.loss.gradient(y, y_pred)\n",
    "            self.estimators[i].fit(X, gradient)\n",
    "            y_pred -= np.multiply(self.learning_rate, self.estimators[i].predict(X))\n",
    "            \n",
    "    # 预测方法\n",
    "    def predict(self, X):\n",
    "        # 回归树预测\n",
    "        y_pred = self.estimators[0].predict(X)\n",
    "        for i in range(1, self.n_estimators):\n",
    "            y_pred -= np.multiply(self.learning_rate, self.estimators[i].predict(X))\n",
    "        # 分类树预测\n",
    "        if not self.regression:\n",
    "            # 将预测值转化为概率\n",
    "            y_pred = np.exp(y_pred) / np.expand_dims(np.sum(np.exp(y_pred), axis=1), axis=1)\n",
    "            # 转化为预测标签\n",
    "            y_pred = np.argmax(y_pred, axis=1)\n",
    "        return y_pred"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "code_folding": []
   },
   "outputs": [],
   "source": [
    "### GBDT分类树\n",
    "class GBDTClassifier(GBDT):\n",
    "      def __init__(self, n_estimators=200, learning_rate=.5, min_samples_split=2,\n",
    "                 min_info_gain=1e-6, max_depth=2):\n",
    "            super(GBDTClassifier, self).__init__(n_estimators=n_estimators,\n",
    "                                             learning_rate=learning_rate,\n",
    "                                             min_samples_split=min_samples_split,\n",
    "                                             min_gini_impurity=min_info_gain,\n",
    "                                             max_depth=max_depth,\n",
    "                                             regression=False)\n",
    "      # 拟合方法\n",
    "      def fit(self, X, y):\n",
    "            super(GBDTClassifier, self).fit(X, y)\n",
    "        \n",
    "### GBDT回归树\n",
    "class GBDTRegressor(GBDT):\n",
    "      def __init__(self, n_estimators=300, learning_rate=0.1, min_samples_split=2,\n",
    "                 min_var_reduction=1e-6, max_depth=3):\n",
    "        super(GBDTRegressor, self).__init__(n_estimators=n_estimators,\n",
    "                                            learning_rate=learning_rate,\n",
    "                                            min_samples_split=min_samples_split,\n",
    "                                            min_gini_impurity=min_var_reduction,\n",
    "                                            max_depth=max_depth,\n",
    "                                            regression=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "### 定义回归树的平方损失\n",
    "class SquareLoss():\n",
    "    # 定义平方损失\n",
    "    def loss(self, y, y_pred):\n",
    "        return 0.5 * np.power((y - y_pred), 2)\n",
    "    # 定义平方损失的梯度\n",
    "    def gradient(self, y, y_pred):\n",
    "        return -(y - y_pred)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Mean Squared Error of NumPy GBRT: 84.29078032628252\n"
     ]
    }
   ],
   "source": [
    "### GBDT分类树\n",
    "# 导入sklearn数据集模块\n",
    "from sklearn import datasets\n",
    "# 导入波士顿房价数据集\n",
    "boston = datasets.load_boston()\n",
    "# 打乱数据集\n",
    "X, y = shuffle_data(boston.data, boston.target, seed=13)\n",
    "X = X.astype(np.float32)\n",
    "offset = int(X.shape[0] * 0.9)\n",
    "# 划分数据集\n",
    "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)\n",
    "# 创建GBRT实例\n",
    "model = GBDTRegressor()\n",
    "# 模型训练\n",
    "model.fit(X_train, y_train)\n",
    "# 模型预测\n",
    "y_pred = model.predict(X_test)\n",
    "# 计算模型预测的均方误差\n",
    "mse = mean_squared_error(y_test, y_pred)\n",
    "print (\"Mean Squared Error of NumPy GBRT:\", mse)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Mean Squared Error of sklearn GBRT: 14.885053466425939\n"
     ]
    }
   ],
   "source": [
    "# 导入sklearn GBDT模块\n",
    "from sklearn.ensemble import GradientBoostingRegressor\n",
    "# 创建模型实例\n",
    "reg = GradientBoostingRegressor(n_estimators=200, learning_rate=0.5,\n",
    "                                 max_depth=4, random_state=0)\n",
    "# 模型拟合\n",
    "reg.fit(X_train, y_train)\n",
    "# 模型预测\n",
    "y_pred = reg.predict(X_test)\n",
    "# 计算模型预测的均方误差\n",
    "mse = mean_squared_error(y_test, y_pred)\n",
    "print (\"Mean Squared Error of sklearn GBRT:\", mse)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
